{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "from cot import Collection\n",
    "from cot.stats import evaluation_as_table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading worldtree...\n",
      "Downloading and preparing dataset worldtree_dataset/thoughtsource to /home/kon/.cache/huggingface/datasets/worldtree_dataset/thoughtsource/1.0.0/4ec0cd827b41f05891af9a27bf461fecd407e2fe7c1beebfed1eb00193c2cd52...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7ec71c210d514b0ebdad9c032e1050d6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ef58f1243ea643899254cc145cb3daf1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating test split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e198001b110a436c8767e5fa37f33f0a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating validation split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset worldtree_dataset downloaded and prepared to /home/kon/.cache/huggingface/datasets/worldtree_dataset/thoughtsource/1.0.0/4ec0cd827b41f05891af9a27bf461fecd407e2fe7c1beebfed1eb00193c2cd52. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8649baeac1a2463a8f63dec958d3f184",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "wt = Collection(\"worldtree\", load_pregenerated_cots=True, generate_mode=\"recache\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "coll = Collection.from_json(\"/home/kon/work/ThoughtSource/notebooks/worldtree_10.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8d0e49dbd48f4510b7f3ec53565299f8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "coll.dump(\"/home/kon/work/ThoughtSource/notebooks/worldtree_10_new.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2bc8c755a6c149deacd3364260e9fcbf",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/10 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<style type=\"text/css\">\n",
       "#T_0899b_row0_col2, #T_0899b_row1_col2 {\n",
       "  font-weight: bold;\n",
       "}\n",
       "</style>\n",
       "<table id=\"T_0899b\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th class=\"blank level0\" >&nbsp;</th>\n",
       "      <th id=\"T_0899b_level0_col0\" class=\"col_heading level0 col0\" >None_kojima-01</th>\n",
       "      <th id=\"T_0899b_level0_col1\" class=\"col_heading level0 col1\" >None_kojima-02</th>\n",
       "      <th id=\"T_0899b_level0_col2\" class=\"col_heading level0 col2\" >None_kojima-03</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th class=\"blank level1\" >&nbsp;</th>\n",
       "      <th id=\"T_0899b_level1_col0\" class=\"col_heading level1 col0\" >text-davinci-003</th>\n",
       "      <th id=\"T_0899b_level1_col1\" class=\"col_heading level1 col1\" >text-davinci-003</th>\n",
       "      <th id=\"T_0899b_level1_col2\" class=\"col_heading level1 col2\" >text-davinci-003</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th id=\"T_0899b_level0_row0\" class=\"row_heading level0 row0\" >worldtree</th>\n",
       "      <td id=\"T_0899b_row0_col0\" class=\"data row0 col0\" >0.70</td>\n",
       "      <td id=\"T_0899b_row0_col1\" class=\"data row0 col1\" >0.70</td>\n",
       "      <td id=\"T_0899b_row0_col2\" class=\"data row0 col2\" >0.80</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_0899b_level0_row1\" class=\"row_heading level0 row1\" >Average</th>\n",
       "      <td id=\"T_0899b_row1_col0\" class=\"data row1 col0\" >0.70</td>\n",
       "      <td id=\"T_0899b_row1_col1\" class=\"data row1 col1\" >0.70</td>\n",
       "      <td id=\"T_0899b_row1_col2\" class=\"data row1 col2\" >0.80</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n"
      ],
      "text/plain": [
       "<pandas.io.formats.style.Styler at 0x7f34b94e6a70>"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "eval = coll.evaluate()\n",
    "table = evaluation_as_table(eval)\n",
    "table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "177fabb4f22845f1a3a51de4eb50a5a9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/10 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<style type=\"text/css\">\n",
       "#T_14cbd_row0_col1, #T_14cbd_row1_col1 {\n",
       "  font-weight: bold;\n",
       "}\n",
       "</style>\n",
       "<table id=\"T_14cbd\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th class=\"blank level0\" >&nbsp;</th>\n",
       "      <th id=\"T_14cbd_level0_col0\" class=\"col_heading level0 col0\" >None_kojima-01</th>\n",
       "      <th id=\"T_14cbd_level0_col1\" class=\"col_heading level0 col1\" >None_kojima-02</th>\n",
       "      <th id=\"T_14cbd_level0_col2\" class=\"col_heading level0 col2\" >None_kojima-03</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th class=\"blank level1\" >&nbsp;</th>\n",
       "      <th id=\"T_14cbd_level1_col0\" class=\"col_heading level1 col0\" >text-davinci-003</th>\n",
       "      <th id=\"T_14cbd_level1_col1\" class=\"col_heading level1 col1\" >text-davinci-003</th>\n",
       "      <th id=\"T_14cbd_level1_col2\" class=\"col_heading level1 col2\" >text-davinci-003</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th id=\"T_14cbd_level0_row0\" class=\"row_heading level0 row0\" >worldtree</th>\n",
       "      <td id=\"T_14cbd_row0_col0\" class=\"data row0 col0\" >0.70</td>\n",
       "      <td id=\"T_14cbd_row0_col1\" class=\"data row0 col1\" >0.80</td>\n",
       "      <td id=\"T_14cbd_row0_col2\" class=\"data row0 col2\" >0.80</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_14cbd_level0_row1\" class=\"row_heading level0 row1\" >Average</th>\n",
       "      <td id=\"T_14cbd_row1_col0\" class=\"data row1 col0\" >0.70</td>\n",
       "      <td id=\"T_14cbd_row1_col1\" class=\"data row1 col1\" >0.80</td>\n",
       "      <td id=\"T_14cbd_row1_col2\" class=\"data row1 col2\" >0.80</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n"
      ],
      "text/plain": [
       "<pandas.io.formats.style.Styler at 0x7f34ba443220>"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "eval = coll.evaluate(overwrite=True)\n",
    "table = evaluation_as_table(eval)\n",
    "table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2f2ca0121e3a402a975b0b908bef3bd5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "coll.dump(\"/home/kon/work/ThoughtSource/notebooks/worldtree_10_new_eval.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts = Collection.load_thoughtsource_100([\"open_book_qa\",\"worldtree\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts = Collection.from_json(\"/home/kon/work/ThoughtSource/libs/cot/cot/datasets/thoughtsource/thoughtsource_100_own.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "| Name           | Train   | Valid   | Test   |\n",
       "|----------------|---------|---------|--------|\n",
       "| commonsense_qa | -       | 100     | -      |\n",
       "| med_qa         | -       | -       | 100    |\n",
       "| medmc_qa       | -       | 100     | -      |\n",
       "| open_book_qa   | -       | -       | 100    |\n",
       "| strategy_qa    | 100     | -       | -      |\n",
       "| worldtree      | -       | -       | 100    |\n",
       "\n",
       "Not loaded: ['aqua', 'asdiv', 'entailment_bank', 'gsm8k', 'mawps', 'pubmed_qa', 'qed', 'svamp']"
      ]
     },
     "execution_count": 107,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts.unload_datasets([\"commonsense_qa\", \"med_qa\", \"medmc_qa\", \"strategy_qa\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "| Name         | Train   | Valid   |   Test |\n",
       "|--------------|---------|---------|--------|\n",
       "| open_book_qa | -       | -       |    100 |\n",
       "| worldtree    | -       | -       |    100 |\n",
       "\n",
       "Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'pubmed_qa', 'qed', 'strategy_qa', 'svamp']"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts.select_generated_cots(\n",
    "    author=\"thoughtsource\",\n",
    "    model=\"text-davinci-002\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ts.dump(\"/home/kon/work/ThoughtSource/libs/cot/cot/datasets/thoughtsource/thoughtsource_100_own.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "| Name         | Train   | Valid   |   Test |\n",
       "|--------------|---------|---------|--------|\n",
       "| open_book_qa | -       | -       |    100 |\n",
       "| worldtree    | -       | -       |    100 |\n",
       "\n",
       "Not loaded: ['aqua', 'asdiv', 'commonsense_qa', 'entailment_bank', 'gsm8k', 'mawps', 'med_qa', 'medmc_qa', 'pubmed_qa', 'qed', 'strategy_qa', 'svamp']"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts.dump(\"strategy_before\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'open_book_qa': {'test': {'accuracy': {'text-davinci-002': {'None_None_kojima-A-D': 0.67,\n",
       "     'None_kojima-01_kojima-A-D': 0.58}}}},\n",
       " 'worldtree': {'test': {'accuracy': {'text-davinci-002': {'None_None_kojima-A-D': 0.88,\n",
       "     'None_kojima-01_kojima-A-D': 0.79}}}}}"
      ]
     },
     "execution_count": 116,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ts.evaluate(overwrite=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [],
   "source": [
    "ts.dump(\"strategy_after\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/kon/work/ThoughtSource/libs/cot/cot/stats.py:406: PerformanceWarning: indexing past lexsort depth may impact performance.\n",
      "  df.loc[dataset, (instruction + \"_\" + cot_trigger, model)] = v\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<style type=\"text/css\">\n",
       "#T_54318_row0_col1, #T_54318_row1_col15, #T_54318_row2_col15, #T_54318_row3_col15, #T_54318_row4_col15, #T_54318_row5_col3, #T_54318_row6_col15 {\n",
       "  font-weight: bold;\n",
       "}\n",
       "</style>\n",
       "<table id=\"T_54318\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th class=\"blank level0\" >&nbsp;</th>\n",
       "      <th id=\"T_54318_level0_col0\" class=\"col_heading level0 col0\" colspan=\"6\">None_None</th>\n",
       "      <th id=\"T_54318_level0_col6\" class=\"col_heading level0 col6\" colspan=\"5\">None_kojima-01</th>\n",
       "      <th id=\"T_54318_level0_col11\" class=\"col_heading level0 col11\" >None_kojima-03</th>\n",
       "      <th id=\"T_54318_level0_col12\" class=\"col_heading level0 col12\" >None_kojima-09</th>\n",
       "      <th id=\"T_54318_level0_col13\" class=\"col_heading level0 col13\" colspan=\"3\">None_zhou-01</th>\n",
       "      <th id=\"T_54318_level0_col16\" class=\"col_heading level0 col16\" >qa-01_None</th>\n",
       "      <th id=\"T_54318_level0_col17\" class=\"col_heading level0 col17\" >qa-05_None</th>\n",
       "      <th id=\"T_54318_level0_col18\" class=\"col_heading level0 col18\" >qa-08_None</th>\n",
       "      <th id=\"T_54318_level0_col19\" class=\"col_heading level0 col19\" >qa-09_None</th>\n",
       "      <th id=\"T_54318_level0_col20\" class=\"col_heading level0 col20\" >qa-10_None</th>\n",
       "      <th id=\"T_54318_level0_col21\" class=\"col_heading level0 col21\" >qa-12_None</th>\n",
       "      <th id=\"T_54318_level0_col22\" class=\"col_heading level0 col22\" >qa-13_None</th>\n",
       "      <th id=\"T_54318_level0_col23\" class=\"col_heading level0 col23\" >qa-16_None</th>\n",
       "      <th id=\"T_54318_level0_col24\" class=\"col_heading level0 col24\" >qa-17_None</th>\n",
       "      <th id=\"T_54318_level0_col25\" class=\"col_heading level0 col25\" >zhou-01-ins_None</th>\n",
       "      <th id=\"T_54318_level0_col26\" class=\"col_heading level0 col26\" >zhou-01-ins_zhou-01</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th class=\"blank level1\" >&nbsp;</th>\n",
       "      <th id=\"T_54318_level1_col0\" class=\"col_heading level1 col0\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_54318_level1_col1\" class=\"col_heading level1 col1\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_54318_level1_col2\" class=\"col_heading level1 col2\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_54318_level1_col3\" class=\"col_heading level1 col3\" >gpt-4</th>\n",
       "      <th id=\"T_54318_level1_col4\" class=\"col_heading level1 col4\" >text-davinci-002</th>\n",
       "      <th id=\"T_54318_level1_col5\" class=\"col_heading level1 col5\" >text-davinci-003</th>\n",
       "      <th id=\"T_54318_level1_col6\" class=\"col_heading level1 col6\" >command-xlarge-nightly</th>\n",
       "      <th id=\"T_54318_level1_col7\" class=\"col_heading level1 col7\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_54318_level1_col8\" class=\"col_heading level1 col8\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_54318_level1_col9\" class=\"col_heading level1 col9\" >text-davinci-002</th>\n",
       "      <th id=\"T_54318_level1_col10\" class=\"col_heading level1 col10\" >text-davinci-003</th>\n",
       "      <th id=\"T_54318_level1_col11\" class=\"col_heading level1 col11\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_54318_level1_col12\" class=\"col_heading level1 col12\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_54318_level1_col13\" class=\"col_heading level1 col13\" >flan-T5-xxl</th>\n",
       "      <th id=\"T_54318_level1_col14\" class=\"col_heading level1 col14\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_54318_level1_col15\" class=\"col_heading level1 col15\" >gpt-4</th>\n",
       "      <th id=\"T_54318_level1_col16\" class=\"col_heading level1 col16\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_54318_level1_col17\" class=\"col_heading level1 col17\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_54318_level1_col18\" class=\"col_heading level1 col18\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_54318_level1_col19\" class=\"col_heading level1 col19\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_54318_level1_col20\" class=\"col_heading level1 col20\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_54318_level1_col21\" class=\"col_heading level1 col21\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_54318_level1_col22\" class=\"col_heading level1 col22\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_54318_level1_col23\" class=\"col_heading level1 col23\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_54318_level1_col24\" class=\"col_heading level1 col24\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_54318_level1_col25\" class=\"col_heading level1 col25\" >gpt-3.5-turbo</th>\n",
       "      <th id=\"T_54318_level1_col26\" class=\"col_heading level1 col26\" >gpt-3.5-turbo</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th id=\"T_54318_level0_row0\" class=\"row_heading level0 row0\" >commonsense_qa</th>\n",
       "      <td id=\"T_54318_row0_col0\" class=\"data row0 col0\" >0.51</td>\n",
       "      <td id=\"T_54318_row0_col1\" class=\"data row0 col1\" >0.87</td>\n",
       "      <td id=\"T_54318_row0_col2\" class=\"data row0 col2\" >0.72</td>\n",
       "      <td id=\"T_54318_row0_col3\" class=\"data row0 col3\" >0.75</td>\n",
       "      <td id=\"T_54318_row0_col4\" class=\"data row0 col4\" >0.76</td>\n",
       "      <td id=\"T_54318_row0_col5\" class=\"data row0 col5\" >0.72</td>\n",
       "      <td id=\"T_54318_row0_col6\" class=\"data row0 col6\" >0.53</td>\n",
       "      <td id=\"T_54318_row0_col7\" class=\"data row0 col7\" >0.81</td>\n",
       "      <td id=\"T_54318_row0_col8\" class=\"data row0 col8\" >0.67</td>\n",
       "      <td id=\"T_54318_row0_col9\" class=\"data row0 col9\" >0.62</td>\n",
       "      <td id=\"T_54318_row0_col10\" class=\"data row0 col10\" >0.65</td>\n",
       "      <td id=\"T_54318_row0_col11\" class=\"data row0 col11\" >0.63</td>\n",
       "      <td id=\"T_54318_row0_col12\" class=\"data row0 col12\" >0.70</td>\n",
       "      <td id=\"T_54318_row0_col13\" class=\"data row0 col13\" >0.83</td>\n",
       "      <td id=\"T_54318_row0_col14\" class=\"data row0 col14\" >0.66</td>\n",
       "      <td id=\"T_54318_row0_col15\" class=\"data row0 col15\" >0.72</td>\n",
       "      <td id=\"T_54318_row0_col16\" class=\"data row0 col16\" >0.66</td>\n",
       "      <td id=\"T_54318_row0_col17\" class=\"data row0 col17\" >0.69</td>\n",
       "      <td id=\"T_54318_row0_col18\" class=\"data row0 col18\" >0.62</td>\n",
       "      <td id=\"T_54318_row0_col19\" class=\"data row0 col19\" >0.64</td>\n",
       "      <td id=\"T_54318_row0_col20\" class=\"data row0 col20\" >0.68</td>\n",
       "      <td id=\"T_54318_row0_col21\" class=\"data row0 col21\" >0.63</td>\n",
       "      <td id=\"T_54318_row0_col22\" class=\"data row0 col22\" >0.61</td>\n",
       "      <td id=\"T_54318_row0_col23\" class=\"data row0 col23\" >0.58</td>\n",
       "      <td id=\"T_54318_row0_col24\" class=\"data row0 col24\" >0.66</td>\n",
       "      <td id=\"T_54318_row0_col25\" class=\"data row0 col25\" >0.72</td>\n",
       "      <td id=\"T_54318_row0_col26\" class=\"data row0 col26\" >0.65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_54318_level0_row1\" class=\"row_heading level0 row1\" >med_qa</th>\n",
       "      <td id=\"T_54318_row1_col0\" class=\"data row1 col0\" >0.23</td>\n",
       "      <td id=\"T_54318_row1_col1\" class=\"data row1 col1\" >0.32</td>\n",
       "      <td id=\"T_54318_row1_col2\" class=\"data row1 col2\" >0.58</td>\n",
       "      <td id=\"T_54318_row1_col3\" class=\"data row1 col3\" >0.73</td>\n",
       "      <td id=\"T_54318_row1_col4\" class=\"data row1 col4\" >0.41</td>\n",
       "      <td id=\"T_54318_row1_col5\" class=\"data row1 col5\" >0.43</td>\n",
       "      <td id=\"T_54318_row1_col6\" class=\"data row1 col6\" >0.31</td>\n",
       "      <td id=\"T_54318_row1_col7\" class=\"data row1 col7\" >0.34</td>\n",
       "      <td id=\"T_54318_row1_col8\" class=\"data row1 col8\" >0.59</td>\n",
       "      <td id=\"T_54318_row1_col9\" class=\"data row1 col9\" >0.34</td>\n",
       "      <td id=\"T_54318_row1_col10\" class=\"data row1 col10\" >0.43</td>\n",
       "      <td id=\"T_54318_row1_col11\" class=\"data row1 col11\" >0.59</td>\n",
       "      <td id=\"T_54318_row1_col12\" class=\"data row1 col12\" >0.51</td>\n",
       "      <td id=\"T_54318_row1_col13\" class=\"data row1 col13\" >0.27</td>\n",
       "      <td id=\"T_54318_row1_col14\" class=\"data row1 col14\" >0.65</td>\n",
       "      <td id=\"T_54318_row1_col15\" class=\"data row1 col15\" >0.76</td>\n",
       "      <td id=\"T_54318_row1_col16\" class=\"data row1 col16\" >0.54</td>\n",
       "      <td id=\"T_54318_row1_col17\" class=\"data row1 col17\" >0.53</td>\n",
       "      <td id=\"T_54318_row1_col18\" class=\"data row1 col18\" >0.46</td>\n",
       "      <td id=\"T_54318_row1_col19\" class=\"data row1 col19\" >0.55</td>\n",
       "      <td id=\"T_54318_row1_col20\" class=\"data row1 col20\" >0.56</td>\n",
       "      <td id=\"T_54318_row1_col21\" class=\"data row1 col21\" >0.59</td>\n",
       "      <td id=\"T_54318_row1_col22\" class=\"data row1 col22\" >0.49</td>\n",
       "      <td id=\"T_54318_row1_col23\" class=\"data row1 col23\" >0.60</td>\n",
       "      <td id=\"T_54318_row1_col24\" class=\"data row1 col24\" >0.56</td>\n",
       "      <td id=\"T_54318_row1_col25\" class=\"data row1 col25\" >0.54</td>\n",
       "      <td id=\"T_54318_row1_col26\" class=\"data row1 col26\" >0.52</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_54318_level0_row2\" class=\"row_heading level0 row2\" >medmc_qa</th>\n",
       "      <td id=\"T_54318_row2_col0\" class=\"data row2 col0\" >0.25</td>\n",
       "      <td id=\"T_54318_row2_col1\" class=\"data row2 col1\" >0.34</td>\n",
       "      <td id=\"T_54318_row2_col2\" class=\"data row2 col2\" >0.58</td>\n",
       "      <td id=\"T_54318_row2_col3\" class=\"data row2 col3\" >0.69</td>\n",
       "      <td id=\"T_54318_row2_col4\" class=\"data row2 col4\" >0.34</td>\n",
       "      <td id=\"T_54318_row2_col5\" class=\"data row2 col5\" >0.40</td>\n",
       "      <td id=\"T_54318_row2_col6\" class=\"data row2 col6\" >0.22</td>\n",
       "      <td id=\"T_54318_row2_col7\" class=\"data row2 col7\" >0.35</td>\n",
       "      <td id=\"T_54318_row2_col8\" class=\"data row2 col8\" >0.47</td>\n",
       "      <td id=\"T_54318_row2_col9\" class=\"data row2 col9\" >0.34</td>\n",
       "      <td id=\"T_54318_row2_col10\" class=\"data row2 col10\" >0.36</td>\n",
       "      <td id=\"T_54318_row2_col11\" class=\"data row2 col11\" >0.50</td>\n",
       "      <td id=\"T_54318_row2_col12\" class=\"data row2 col12\" >0.50</td>\n",
       "      <td id=\"T_54318_row2_col13\" class=\"data row2 col13\" >0.31</td>\n",
       "      <td id=\"T_54318_row2_col14\" class=\"data row2 col14\" >0.48</td>\n",
       "      <td id=\"T_54318_row2_col15\" class=\"data row2 col15\" >0.70</td>\n",
       "      <td id=\"T_54318_row2_col16\" class=\"data row2 col16\" >0.47</td>\n",
       "      <td id=\"T_54318_row2_col17\" class=\"data row2 col17\" >0.42</td>\n",
       "      <td id=\"T_54318_row2_col18\" class=\"data row2 col18\" >0.45</td>\n",
       "      <td id=\"T_54318_row2_col19\" class=\"data row2 col19\" >0.47</td>\n",
       "      <td id=\"T_54318_row2_col20\" class=\"data row2 col20\" >0.49</td>\n",
       "      <td id=\"T_54318_row2_col21\" class=\"data row2 col21\" >0.53</td>\n",
       "      <td id=\"T_54318_row2_col22\" class=\"data row2 col22\" >0.41</td>\n",
       "      <td id=\"T_54318_row2_col23\" class=\"data row2 col23\" >0.48</td>\n",
       "      <td id=\"T_54318_row2_col24\" class=\"data row2 col24\" >0.53</td>\n",
       "      <td id=\"T_54318_row2_col25\" class=\"data row2 col25\" >0.44</td>\n",
       "      <td id=\"T_54318_row2_col26\" class=\"data row2 col26\" >0.40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_54318_level0_row3\" class=\"row_heading level0 row3\" >open_book_qa</th>\n",
       "      <td id=\"T_54318_row3_col0\" class=\"data row3 col0\" >0.59</td>\n",
       "      <td id=\"T_54318_row3_col1\" class=\"data row3 col1\" >0.82</td>\n",
       "      <td id=\"T_54318_row3_col2\" class=\"data row3 col2\" >0.77</td>\n",
       "      <td id=\"T_54318_row3_col3\" class=\"data row3 col3\" >0.92</td>\n",
       "      <td id=\"T_54318_row3_col4\" class=\"data row3 col4\" >0.67</td>\n",
       "      <td id=\"T_54318_row3_col5\" class=\"data row3 col5\" >0.70</td>\n",
       "      <td id=\"T_54318_row3_col6\" class=\"data row3 col6\" >0.38</td>\n",
       "      <td id=\"T_54318_row3_col7\" class=\"data row3 col7\" >0.79</td>\n",
       "      <td id=\"T_54318_row3_col8\" class=\"data row3 col8\" >0.77</td>\n",
       "      <td id=\"T_54318_row3_col9\" class=\"data row3 col9\" >0.57</td>\n",
       "      <td id=\"T_54318_row3_col10\" class=\"data row3 col10\" >0.67</td>\n",
       "      <td id=\"T_54318_row3_col11\" class=\"data row3 col11\" >0.73</td>\n",
       "      <td id=\"T_54318_row3_col12\" class=\"data row3 col12\" >0.73</td>\n",
       "      <td id=\"T_54318_row3_col13\" class=\"data row3 col13\" >0.81</td>\n",
       "      <td id=\"T_54318_row3_col14\" class=\"data row3 col14\" >0.81</td>\n",
       "      <td id=\"T_54318_row3_col15\" class=\"data row3 col15\" >0.95</td>\n",
       "      <td id=\"T_54318_row3_col16\" class=\"data row3 col16\" >0.73</td>\n",
       "      <td id=\"T_54318_row3_col17\" class=\"data row3 col17\" >0.65</td>\n",
       "      <td id=\"T_54318_row3_col18\" class=\"data row3 col18\" >0.73</td>\n",
       "      <td id=\"T_54318_row3_col19\" class=\"data row3 col19\" >0.71</td>\n",
       "      <td id=\"T_54318_row3_col20\" class=\"data row3 col20\" >0.73</td>\n",
       "      <td id=\"T_54318_row3_col21\" class=\"data row3 col21\" >0.80</td>\n",
       "      <td id=\"T_54318_row3_col22\" class=\"data row3 col22\" >0.72</td>\n",
       "      <td id=\"T_54318_row3_col23\" class=\"data row3 col23\" >0.69</td>\n",
       "      <td id=\"T_54318_row3_col24\" class=\"data row3 col24\" >0.69</td>\n",
       "      <td id=\"T_54318_row3_col25\" class=\"data row3 col25\" >0.76</td>\n",
       "      <td id=\"T_54318_row3_col26\" class=\"data row3 col26\" >0.74</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_54318_level0_row4\" class=\"row_heading level0 row4\" >strategy_qa</th>\n",
       "      <td id=\"T_54318_row4_col0\" class=\"data row4 col0\" >0.52</td>\n",
       "      <td id=\"T_54318_row4_col1\" class=\"data row4 col1\" >0.61</td>\n",
       "      <td id=\"T_54318_row4_col2\" class=\"data row4 col2\" >0.57</td>\n",
       "      <td id=\"T_54318_row4_col3\" class=\"data row4 col3\" >0.71</td>\n",
       "      <td id=\"T_54318_row4_col4\" class=\"data row4 col4\" >0.36</td>\n",
       "      <td id=\"T_54318_row4_col5\" class=\"data row4 col5\" >0.53</td>\n",
       "      <td id=\"T_54318_row4_col6\" class=\"data row4 col6\" >0.59</td>\n",
       "      <td id=\"T_54318_row4_col7\" class=\"data row4 col7\" >0.69</td>\n",
       "      <td id=\"T_54318_row4_col8\" class=\"data row4 col8\" >0.56</td>\n",
       "      <td id=\"T_54318_row4_col9\" class=\"data row4 col9\" >0.38</td>\n",
       "      <td id=\"T_54318_row4_col10\" class=\"data row4 col10\" >0.54</td>\n",
       "      <td id=\"T_54318_row4_col11\" class=\"data row4 col11\" >0.52</td>\n",
       "      <td id=\"T_54318_row4_col12\" class=\"data row4 col12\" >0.58</td>\n",
       "      <td id=\"T_54318_row4_col13\" class=\"data row4 col13\" >0.59</td>\n",
       "      <td id=\"T_54318_row4_col14\" class=\"data row4 col14\" >0.59</td>\n",
       "      <td id=\"T_54318_row4_col15\" class=\"data row4 col15\" >0.80</td>\n",
       "      <td id=\"T_54318_row4_col16\" class=\"data row4 col16\" >0.44</td>\n",
       "      <td id=\"T_54318_row4_col17\" class=\"data row4 col17\" >0.43</td>\n",
       "      <td id=\"T_54318_row4_col18\" class=\"data row4 col18\" >0.58</td>\n",
       "      <td id=\"T_54318_row4_col19\" class=\"data row4 col19\" >0.62</td>\n",
       "      <td id=\"T_54318_row4_col20\" class=\"data row4 col20\" >0.56</td>\n",
       "      <td id=\"T_54318_row4_col21\" class=\"data row4 col21\" >0.50</td>\n",
       "      <td id=\"T_54318_row4_col22\" class=\"data row4 col22\" >0.64</td>\n",
       "      <td id=\"T_54318_row4_col23\" class=\"data row4 col23\" >0.63</td>\n",
       "      <td id=\"T_54318_row4_col24\" class=\"data row4 col24\" >0.58</td>\n",
       "      <td id=\"T_54318_row4_col25\" class=\"data row4 col25\" >0.52</td>\n",
       "      <td id=\"T_54318_row4_col26\" class=\"data row4 col26\" >0.57</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_54318_level0_row5\" class=\"row_heading level0 row5\" >worldtree</th>\n",
       "      <td id=\"T_54318_row5_col0\" class=\"data row5 col0\" >0.63</td>\n",
       "      <td id=\"T_54318_row5_col1\" class=\"data row5 col1\" >0.85</td>\n",
       "      <td id=\"T_54318_row5_col2\" class=\"data row5 col2\" >0.96</td>\n",
       "      <td id=\"T_54318_row5_col3\" class=\"data row5 col3\" >0.99</td>\n",
       "      <td id=\"T_54318_row5_col4\" class=\"data row5 col4\" >0.88</td>\n",
       "      <td id=\"T_54318_row5_col5\" class=\"data row5 col5\" >0.91</td>\n",
       "      <td id=\"T_54318_row5_col6\" class=\"data row5 col6\" >0.58</td>\n",
       "      <td id=\"T_54318_row5_col7\" class=\"data row5 col7\" >0.80</td>\n",
       "      <td id=\"T_54318_row5_col8\" class=\"data row5 col8\" >0.93</td>\n",
       "      <td id=\"T_54318_row5_col9\" class=\"data row5 col9\" >0.78</td>\n",
       "      <td id=\"T_54318_row5_col10\" class=\"data row5 col10\" >0.89</td>\n",
       "      <td id=\"T_54318_row5_col11\" class=\"data row5 col11\" >0.95</td>\n",
       "      <td id=\"T_54318_row5_col12\" class=\"data row5 col12\" >0.95</td>\n",
       "      <td id=\"T_54318_row5_col13\" class=\"data row5 col13\" >0.83</td>\n",
       "      <td id=\"T_54318_row5_col14\" class=\"data row5 col14\" >0.92</td>\n",
       "      <td id=\"T_54318_row5_col15\" class=\"data row5 col15\" >0.99</td>\n",
       "      <td id=\"T_54318_row5_col16\" class=\"data row5 col16\" >0.95</td>\n",
       "      <td id=\"T_54318_row5_col17\" class=\"data row5 col17\" >0.74</td>\n",
       "      <td id=\"T_54318_row5_col18\" class=\"data row5 col18\" >0.92</td>\n",
       "      <td id=\"T_54318_row5_col19\" class=\"data row5 col19\" >0.91</td>\n",
       "      <td id=\"T_54318_row5_col20\" class=\"data row5 col20\" >0.95</td>\n",
       "      <td id=\"T_54318_row5_col21\" class=\"data row5 col21\" >0.92</td>\n",
       "      <td id=\"T_54318_row5_col22\" class=\"data row5 col22\" >0.96</td>\n",
       "      <td id=\"T_54318_row5_col23\" class=\"data row5 col23\" >0.91</td>\n",
       "      <td id=\"T_54318_row5_col24\" class=\"data row5 col24\" >0.92</td>\n",
       "      <td id=\"T_54318_row5_col25\" class=\"data row5 col25\" >0.96</td>\n",
       "      <td id=\"T_54318_row5_col26\" class=\"data row5 col26\" >0.96</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_54318_level0_row6\" class=\"row_heading level0 row6\" >Average</th>\n",
       "      <td id=\"T_54318_row6_col0\" class=\"data row6 col0\" >0.46</td>\n",
       "      <td id=\"T_54318_row6_col1\" class=\"data row6 col1\" >0.64</td>\n",
       "      <td id=\"T_54318_row6_col2\" class=\"data row6 col2\" >0.70</td>\n",
       "      <td id=\"T_54318_row6_col3\" class=\"data row6 col3\" >0.80</td>\n",
       "      <td id=\"T_54318_row6_col4\" class=\"data row6 col4\" >0.57</td>\n",
       "      <td id=\"T_54318_row6_col5\" class=\"data row6 col5\" >0.62</td>\n",
       "      <td id=\"T_54318_row6_col6\" class=\"data row6 col6\" >0.44</td>\n",
       "      <td id=\"T_54318_row6_col7\" class=\"data row6 col7\" >0.63</td>\n",
       "      <td id=\"T_54318_row6_col8\" class=\"data row6 col8\" >0.66</td>\n",
       "      <td id=\"T_54318_row6_col9\" class=\"data row6 col9\" >0.50</td>\n",
       "      <td id=\"T_54318_row6_col10\" class=\"data row6 col10\" >0.59</td>\n",
       "      <td id=\"T_54318_row6_col11\" class=\"data row6 col11\" >0.65</td>\n",
       "      <td id=\"T_54318_row6_col12\" class=\"data row6 col12\" >0.66</td>\n",
       "      <td id=\"T_54318_row6_col13\" class=\"data row6 col13\" >0.61</td>\n",
       "      <td id=\"T_54318_row6_col14\" class=\"data row6 col14\" >0.68</td>\n",
       "      <td id=\"T_54318_row6_col15\" class=\"data row6 col15\" >0.82</td>\n",
       "      <td id=\"T_54318_row6_col16\" class=\"data row6 col16\" >0.63</td>\n",
       "      <td id=\"T_54318_row6_col17\" class=\"data row6 col17\" >0.58</td>\n",
       "      <td id=\"T_54318_row6_col18\" class=\"data row6 col18\" >0.63</td>\n",
       "      <td id=\"T_54318_row6_col19\" class=\"data row6 col19\" >0.65</td>\n",
       "      <td id=\"T_54318_row6_col20\" class=\"data row6 col20\" >0.66</td>\n",
       "      <td id=\"T_54318_row6_col21\" class=\"data row6 col21\" >0.66</td>\n",
       "      <td id=\"T_54318_row6_col22\" class=\"data row6 col22\" >0.64</td>\n",
       "      <td id=\"T_54318_row6_col23\" class=\"data row6 col23\" >0.65</td>\n",
       "      <td id=\"T_54318_row6_col24\" class=\"data row6 col24\" >0.66</td>\n",
       "      <td id=\"T_54318_row6_col25\" class=\"data row6 col25\" >0.66</td>\n",
       "      <td id=\"T_54318_row6_col26\" class=\"data row6 col26\" >0.64</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n"
      ],
      "text/plain": [
       "<pandas.io.formats.style.Styler at 0x7f34aa65bdc0>"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Score the collection. Judging by the comparison printout further down, the result is a nested dict:\n",
    "# dataset -> split -> metric -> model -> prompt-combination key -> accuracy.\n",
    "# NOTE(review): the name `eval` shadows Python's builtin eval() for the rest of the kernel session.\n",
    "eval = ts.evaluate()\n",
    "# evaluation_as_table returns a pandas Styler, which is what renders as this cell's HTML table.\n",
    "table = evaluation_as_table(eval)\n",
    "table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "# First of two evaluation runs (default arguments); compared against eval_2 below.\n",
    "eval_1 = ts.evaluate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Second evaluation run, this time with overwrite=True.\n",
    "# NOTE(review): presumably this recomputes stored scores instead of reusing them -- confirm against evaluate().\n",
    "eval_2 = ts.evaluate(overwrite=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Key 'open_book_qa' has different values: {'test': {'accuracy': {'command-xlarge-nightly': {'None_None_kojima-A-D': 0.59, 'None_kojima-01_kojima-A-D': 0.38}, 'flan-T5-xxl': {'None_None_kojima-A-D': 0.82, 'None_kojima-01_kojima-A-D': 0.79, 'None_zhou-01_kojima-A-D': 0.81}, 'gpt-3.5-turbo': {'None_None_kojima-A-D': 0.77, 'None_kojima-01_kojima-A-D': 0.77, 'None_kojima-03_kojima-A-D': 0.73, 'None_kojima-09_kojima-A-D': 0.73, 'None_zhou-01_kojima-A-D': 0.81, 'qa-01_None_kojima-A-D': 0.73, 'qa-05_None_kojima-A-D': 0.65, 'qa-08_None_kojima-A-D': 0.73, 'qa-09_None_kojima-A-D': 0.71, 'qa-10_None_kojima-A-D': 0.73, 'qa-12_None_kojima-A-D': 0.8, 'qa-13_None_kojima-A-D': 0.72, 'qa-16_None_kojima-A-D': 0.69, 'qa-17_None_kojima-A-D': 0.69, 'zhou-01-ins_None_kojima-A-D': 0.76, 'zhou-01-ins_zhou-01_kojima-A-D': 0.74}, 'gpt-4': {'None_None_kojima-A-D': 0.92, 'None_zhou-01_kojima-A-D': 0.95}, 'text-davinci-002': {'None_None_kojima-A-D': 0.67, 'None_kojima-01_kojima-A-D': 0.57}, 'text-davinci-003': {'None_None_kojima-A-D': 0.7, 'None_kojima-01_kojima-A-D': 0.67}}}} (first) vs {'test': {'accuracy': {'command-xlarge-nightly': {'None_None_kojima-A-D': 0.59, 'None_kojima-01_kojima-A-D': 0.38}, 'flan-T5-xxl': {'None_None_kojima-A-D': 0.82, 'None_kojima-01_kojima-A-D': 0.79, 'None_zhou-01_kojima-A-D': 0.81}, 'gpt-3.5-turbo': {'None_None_kojima-A-D': 0.77, 'None_kojima-01_kojima-A-D': 0.77, 'None_kojima-03_kojima-A-D': 0.73, 'None_kojima-09_kojima-A-D': 0.73, 'None_zhou-01_kojima-A-D': 0.81, 'qa-01_None_kojima-A-D': 0.73, 'qa-05_None_kojima-A-D': 0.65, 'qa-08_None_kojima-A-D': 0.73, 'qa-09_None_kojima-A-D': 0.71, 'qa-10_None_kojima-A-D': 0.73, 'qa-12_None_kojima-A-D': 0.8, 'qa-13_None_kojima-A-D': 0.72, 'qa-16_None_kojima-A-D': 0.69, 'qa-17_None_kojima-A-D': 0.69, 'zhou-01-ins_None_kojima-A-D': 0.76, 'zhou-01-ins_zhou-01_kojima-A-D': 0.74}, 'gpt-4': {'None_None_kojima-A-D': 0.92, 'None_zhou-01_kojima-A-D': 0.95}, 'text-davinci-002': {'None_None_kojima-A-D': 0.67, 
'None_kojima-01_kojima-A-D': 0.58}, 'text-davinci-003': {'None_None_kojima-A-D': 0.7, 'None_kojima-01_kojima-A-D': 0.67}}}} (second)\n",
      "Key 'worldtree' has different values: {'test': {'accuracy': {'command-xlarge-nightly': {'None_None_kojima-A-D': 0.63, 'None_kojima-01_kojima-A-D': 0.58}, 'flan-T5-xxl': {'None_None_kojima-A-D': 0.85, 'None_kojima-01_kojima-A-D': 0.8, 'None_zhou-01_kojima-A-D': 0.83}, 'gpt-3.5-turbo': {'None_None_kojima-A-D': 0.96, 'None_kojima-01_kojima-A-D': 0.93, 'None_kojima-03_kojima-A-D': 0.95, 'None_kojima-09_kojima-A-D': 0.95, 'None_zhou-01_kojima-A-D': 0.92, 'qa-01_None_kojima-A-D': 0.95, 'qa-05_None_kojima-A-D': 0.74, 'qa-08_None_kojima-A-D': 0.92, 'qa-09_None_kojima-A-D': 0.91, 'qa-10_None_kojima-A-D': 0.95, 'qa-12_None_kojima-A-D': 0.92, 'qa-13_None_kojima-A-D': 0.96, 'qa-16_None_kojima-A-D': 0.91, 'qa-17_None_kojima-A-D': 0.92, 'zhou-01-ins_None_kojima-A-D': 0.96, 'zhou-01-ins_zhou-01_kojima-A-D': 0.96}, 'gpt-4': {'None_None_kojima-A-D': 0.99, 'None_zhou-01_kojima-A-D': 0.99}, 'text-davinci-002': {'None_None_kojima-A-D': 0.88, 'None_kojima-01_kojima-A-D': 0.78}, 'text-davinci-003': {'None_None_kojima-A-D': 0.91, 'None_kojima-01_kojima-A-D': 0.89}}}} (first) vs {'test': {'accuracy': {'command-xlarge-nightly': {'None_None_kojima-A-D': 0.63, 'None_kojima-01_kojima-A-D': 0.58}, 'flan-T5-xxl': {'None_None_kojima-A-D': 0.85, 'None_kojima-01_kojima-A-D': 0.8, 'None_zhou-01_kojima-A-D': 0.83}, 'gpt-3.5-turbo': {'None_None_kojima-A-D': 0.96, 'None_kojima-01_kojima-A-D': 0.93, 'None_kojima-03_kojima-A-D': 0.95, 'None_kojima-09_kojima-A-D': 0.95, 'None_zhou-01_kojima-A-D': 0.92, 'qa-01_None_kojima-A-D': 0.95, 'qa-05_None_kojima-A-D': 0.74, 'qa-08_None_kojima-A-D': 0.92, 'qa-09_None_kojima-A-D': 0.91, 'qa-10_None_kojima-A-D': 0.95, 'qa-12_None_kojima-A-D': 0.92, 'qa-13_None_kojima-A-D': 0.96, 'qa-16_None_kojima-A-D': 0.91, 'qa-17_None_kojima-A-D': 0.92, 'zhou-01-ins_None_kojima-A-D': 0.96, 'zhou-01-ins_zhou-01_kojima-A-D': 0.96}, 'gpt-4': {'None_None_kojima-A-D': 0.99, 'None_zhou-01_kojima-A-D': 0.99}, 'text-davinci-002': {'None_None_kojima-A-D': 0.88, 
'None_kojima-01_kojima-A-D': 0.79}, 'text-davinci-003': {'None_None_kojima-A-D': 0.91, 'None_kojima-01_kojima-A-D': 0.89}}}} (second)\n",
      "Key 'strategy_qa' has different values: {'train': {'accuracy': {'command-xlarge-nightly': {'None_None_kojima-yes-no': 0.52, 'None_kojima-01_kojima-yes-no': 0.59}, 'flan-T5-xxl': {'None_None_kojima-yes-no': 0.61, 'None_kojima-01_kojima-yes-no': 0.69, 'None_zhou-01_kojima-yes-no': 0.59}, 'gpt-3.5-turbo': {'None_None_kojima-yes-no': 0.57, 'None_kojima-01_kojima-yes-no': 0.56, 'None_kojima-03_kojima-yes-no': 0.52, 'None_kojima-09_kojima-yes-no': 0.58, 'None_zhou-01_kojima-yes-no': 0.59, 'qa-01_None_kojima-yes-no': 0.44, 'qa-05_None_kojima-yes-no': 0.43, 'qa-08_None_kojima-yes-no': 0.58, 'qa-09_None_kojima-yes-no': 0.62, 'qa-10_None_kojima-yes-no': 0.56, 'qa-12_None_kojima-yes-no': 0.5, 'qa-13_None_kojima-yes-no': 0.64, 'qa-16_None_kojima-yes-no': 0.63, 'qa-17_None_kojima-yes-no': 0.58, 'zhou-01-ins_None_kojima-yes-no': 0.52, 'zhou-01-ins_zhou-01_kojima-yes-no': 0.57}, 'gpt-4': {'None_None_kojima-yes-no': 0.71, 'None_zhou-01_kojima-yes-no': 0.8}, 'text-davinci-002': {'None_None_kojima-yes-no': 0.36, 'None_kojima-01_kojima-yes-no': 0.38}, 'text-davinci-003': {'None_None_kojima-yes-no': 0.53, 'None_kojima-01_kojima-yes-no': 0.54}}}} (first) vs {'train': {'accuracy': {'command-xlarge-nightly': {'None_None_kojima-yes-no': 0.52, 'None_kojima-01_kojima-yes-no': 0.59}, 'flan-T5-xxl': {'None_None_kojima-yes-no': 0.61, 'None_kojima-01_kojima-yes-no': 0.69, 'None_zhou-01_kojima-yes-no': 0.59}, 'gpt-3.5-turbo': {'None_None_kojima-yes-no': 0.57, 'None_kojima-01_kojima-yes-no': 0.56, 'None_kojima-03_kojima-yes-no': 0.52, 'None_kojima-09_kojima-yes-no': 0.58, 'None_zhou-01_kojima-yes-no': 0.59, 'qa-01_None_kojima-yes-no': 0.44, 'qa-05_None_kojima-yes-no': 0.43, 'qa-08_None_kojima-yes-no': 0.58, 'qa-09_None_kojima-yes-no': 0.62, 'qa-10_None_kojima-yes-no': 0.56, 'qa-12_None_kojima-yes-no': 0.5, 'qa-13_None_kojima-yes-no': 0.64, 'qa-16_None_kojima-yes-no': 0.63, 'qa-17_None_kojima-yes-no': 0.58, 'zhou-01-ins_None_kojima-yes-no': 0.52, 'zhou-01-ins_zhou-01_kojima-yes-no': 0.57}, 
'gpt-4': {'None_None_kojima-yes-no': 0.71, 'None_zhou-01_kojima-yes-no': 0.8}, 'text-davinci-002': {'None_None_kojima-yes-no': 0.46, 'None_kojima-01_kojima-yes-no': 0.54}, 'text-davinci-003': {'None_None_kojima-yes-no': 0.53, 'None_kojima-01_kojima-yes-no': 0.54}}}} (second)\n"
     ]
    }
   ],
   "source": [
    "# Report where the default run and the overwrite=True run disagree.\n",
    "# In the saved output three dataset keys differ (open_book_qa, worldtree, strategy_qa),\n",
    "# and within them only the text-davinci-002 scores change.\n",
    "compare_dicts(eval_1, eval_2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the first run's scores as JSON in the current working directory.\n",
    "# The explicit encoding keeps the file independent of the machine's locale default.\n",
    "with open('eval_1.json', 'w', encoding='utf-8') as f:\n",
    "    json.dump(eval_1, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the overwrite=True run's scores as JSON in the current working directory.\n",
    "# The explicit encoding keeps the file independent of the machine's locale default.\n",
    "with open('eval_2.json', 'w', encoding='utf-8') as f:\n",
    "    json.dump(eval_2, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading aqua...\n",
      "Downloading and preparing dataset aqua_dataset/thoughtsource to /home/kon/.cache/huggingface/datasets/aqua_dataset/thoughtsource/1.0.0/1e513577b1b5ccfcf97069f9f660ccdc6ebf4987495b5fc95402f12cd9c83a99...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bbdbccca35e74b15b6afcab70eae4866",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dea903a21c9f4ad3a88c7e6c7bd8e4ff",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b6010dff10ba4a3db26b7d2b948362b5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "22edfbb16c3548b682dca925419fa168",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating test split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e32060f0dfc24d3fb89977d87b74c5ce",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating validation split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset aqua_dataset downloaded and prepared to /home/kon/.cache/huggingface/datasets/aqua_dataset/thoughtsource/1.0.0/1e513577b1b5ccfcf97069f9f660ccdc6ebf4987495b5fc95402f12cd9c83a99. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b8cd2ccaaeee4763ba920e431b375893",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9a511627b3224e408de2f3c739c22a05",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/97467 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f7357f7aae8c4cb89751f37184640993",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/254 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7671d43c3ff846ba89390bff2862eeab",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/254 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading asdiv...\n",
      "Downloading and preparing dataset asdiv_dataset/thoughtsource to /home/kon/.cache/huggingface/datasets/asdiv_dataset/thoughtsource/1.0.0/8d75df8ce5c6294c738da1b417c2493b2cc632e11b202a0823c93971fb8101fb...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "759bc456d7b146e2bb120e07bfcdabc1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a0fcb0a21a524db8be3f2a098c665b84",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/6 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "46e652de8e6d4825b98e6ec5a71cceea",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset asdiv_dataset downloaded and prepared to /home/kon/.cache/huggingface/datasets/asdiv_dataset/thoughtsource/1.0.0/8d75df8ce5c6294c738da1b417c2493b2cc632e11b202a0823c93971fb8101fb. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dd8605cb5588465a8771f7c3e834b59d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ab6e07f063c54c309c84bb8c0f8f3fe3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1218 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading commonsense_qa...\n",
      "Downloading and preparing dataset commonsense_qa_dataset/thoughtsource to /home/kon/.cache/huggingface/datasets/commonsense_qa_dataset/thoughtsource/1.0.0/cd427a3573ffb55b035f31bc28bbe62e3eec600ea9bd3186f90bee36862c029a...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4b8a52b49e2b4c24b69938f7b43fd11a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/4 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "456bebfde7cd481b8d7cd32fdd5007ac",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/4 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a11ad7155f854eb3a6df0bd4163806fe",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 kojima cots mapped.\n",
      "0 wei cots mapped.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a5a30755811346a6aeeb9816ea1f0575",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating test split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 kojima cots mapped.\n",
      "0 wei cots mapped.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "69e44fb1f93042f4b734fee8c22eca4a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating validation split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error: 'answer_from_choices'\n",
      "Loading entailment_bank...\n",
      "Downloading and preparing dataset entailment_bank_dataset/thoughtsource to /home/kon/.cache/huggingface/datasets/entailment_bank_dataset/thoughtsource/1.0.0/484622edfc1dec21806b69ef91c8b8b8a83f70ea20ba98bd4ca89caa73566656...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dde79acb30654ce2838230c9103e8829",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c026000f33034d62a0b4a1a14bf75653",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating test split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "39bc001932b8491087b9cb00efe5075e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating validation split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset entailment_bank_dataset downloaded and prepared to /home/kon/.cache/huggingface/datasets/entailment_bank_dataset/thoughtsource/1.0.0/484622edfc1dec21806b69ef91c8b8b8a83f70ea20ba98bd4ca89caa73566656. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ca4e01f4d8a84ee4a99dec4f58225da6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "83d5195d97964050a9c91285965985f0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1313 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a66cea6da4b841edb1bb3c6a1ad6b8ff",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/340 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "51b8f16a407445beaeeb71baad912a24",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/187 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading gsm8k...\n",
      "Downloading and preparing dataset gsm8k_dataset/thoughtsource to /home/kon/.cache/huggingface/datasets/gsm8k_dataset/thoughtsource/1.0.0/75d8b11f06245c6b0e586227209915eefbbd845907359e859c4d09107045f1ec...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b93c5fdeeaf54e44b51468cf45b39741",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "539d277422024d0ebda38f404cee50f5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1e3b0a40d4424ddc8e4ed4f72d353d12",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "429c16efa1bb4b569eb541df1e235441",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating test split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset gsm8k_dataset downloaded and prepared to /home/kon/.cache/huggingface/datasets/gsm8k_dataset/thoughtsource/1.0.0/75d8b11f06245c6b0e586227209915eefbbd845907359e859c4d09107045f1ec. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0e8997aabd1841d69b739c24187e569f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d462d4669e6b452b9b9d370db5d3f9f2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/7473 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fcd1149e911e4ddc8ccfbed8b4358710",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1319 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading mawps...\n",
      "Downloading and preparing dataset mawps_dataset/thoughtsource to /home/kon/.cache/huggingface/datasets/mawps_dataset/thoughtsource/1.0.0/c1f05fb89b5a20a4d6b7129e7692a12abdc1ebf17f623d0598bd80d48d00ff77...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dcd80f5ab8964a7d8164791392396182",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a16bef90ab60412daeb6885263c4c990",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "cb3a34bceb48461db6d0c1396ee22505",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset mawps_dataset downloaded and prepared to /home/kon/.cache/huggingface/datasets/mawps_dataset/thoughtsource/1.0.0/c1f05fb89b5a20a4d6b7129e7692a12abdc1ebf17f623d0598bd80d48d00ff77. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "07b0cb06f92e4de98744644aef85f417",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ec838b400fe640caa6702af2f49b174f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1921 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading med_qa...\n",
      "Downloading and preparing dataset med_qa_dataset/thoughtsource to /home/kon/.cache/huggingface/datasets/med_qa_dataset/thoughtsource/1.0.0/80feaf4e24940034debbd30deb9eeac25df0c6d1dfe2d8fb0cb497c9f549a35c...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "230259100f294780b0eb6b355b93ff86",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3324b0630221415690dc585f9112aba5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4a5da7dd4ac64caea4d4dbd71b7b053d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "572605ab8e9147939ada31927dc29542",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating test split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Preparing Lievin CoTs: 100%|██████████| 6365/6365 [00:01<00:00, 3280.35it/s]\n",
      "Preparing Lievin CoTs v1: 100%|██████████| 1273/1273 [00:00<00:00, 1742.70it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error: 'answer_from_choices'\n",
      "Loading medmc_qa...\n",
      "Downloading and preparing dataset med_mcqa_dataset/thoughtsource to /home/kon/.cache/huggingface/datasets/med_mcqa_dataset/thoughtsource/1.0.0/c1ebfc6e86f8317b0891f26c2fec24ec6f3bfc269c44d7d3f0cb1d77553a925b...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "38cc986fa149439da0a814618fcfe3e4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "00fa9f36fe5b4caebd5b0f65909c9838",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bed8ed7d790447c88c378b283e38bcb0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "16aab510061e42c2a5adafb4dc437cce",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating test split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2803ccc91eae431497ba9a88c0ed4d73",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating validation split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Preparing Lievin CoTs: 100%|██████████| 5000/5000 [00:01<00:00, 3511.08it/s]\n",
      "Preparing Lievin CoTs v1: 100%|██████████| 1000/1000 [00:00<00:00, 1720.07it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error: 'answer_from_choices'\n",
      "Loading open_book_qa...\n",
      "Downloading and preparing dataset open_book_qa_dataset/thoughtsource to /home/kon/.cache/huggingface/datasets/open_book_qa_dataset/thoughtsource/1.0.0/b1163057b19c0b0be4f0f1104103cb49d35de01177baec5250adb5246116272f...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "03be3cf0f4694880895a983a6219a6d0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9e04c0f37abf4d808cd742b694a8a84a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating test split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "52807b294f484ddbabff6e1363d08304",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating validation split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset open_book_qa_dataset downloaded and prepared to /home/kon/.cache/huggingface/datasets/open_book_qa_dataset/thoughtsource/1.0.0/b1163057b19c0b0be4f0f1104103cb49d35de01177baec5250adb5246116272f. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e5d02ff57d4c48f1b27f135484603009",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ad6ca7cbb47d41319b378ee486197850",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/4957 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2040efb4c93340498c4f7b97a921d04e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/500 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "95726f14e1524e2cab546482a9032e48",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/500 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading pubmed_qa...\n",
      "Downloading and preparing dataset pubmed_qa_dataset/thoughtsource to /home/kon/.cache/huggingface/datasets/pubmed_qa_dataset/thoughtsource/1.0.0/d386f4f2eb85f5dd9a29acf397480883450d456ede8cb45d4a6f3dcf4577bd6f...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7c902c4572fd44cf93662e4b93f9bcae",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/4 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "39e4fff84c56455dae876d2c09d66eeb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/4 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8f7a6cd88eab4a028b06e65158e74147",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Preparing Lievin CoTs: 100%|██████████| 2500/2500 [00:00<00:00, 3122.32it/s]\n",
      "Preparing Lievin CoTs v1: 100%|██████████| 500/500 [00:00<00:00, 1678.49it/s]\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b8693620beaf4cf7887f6592b4d9efcb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating test split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Preparing Lievin CoTs: 100%|██████████| 2500/2500 [00:00<00:00, 18693.09it/s]\n",
      "Preparing Lievin CoTs v1: 100%|██████████| 500/500 [00:00<00:00, 6351.54it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error: 'answer_from_choices'\n",
      "Loading qed...\n",
      "Downloading and preparing dataset qed_dataset/thoughtsource to /home/kon/.cache/huggingface/datasets/qed_dataset/thoughtsource/1.0.0/e47081997176b275c4a9ce5e36477ee4144ca1551b24b4d2ca4ddec0e8ba6aab...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9328db566d0a4971887762bec99d134c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "77555147d5a2447ba04e48d62f0c84b7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "49633204cb1241d6a1a9d54d1944b1a8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ab33c3928ff148f7a9c3e81035352bdb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating validation split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset qed_dataset downloaded and prepared to /home/kon/.cache/huggingface/datasets/qed_dataset/thoughtsource/1.0.0/e47081997176b275c4a9ce5e36477ee4144ca1551b24b4d2ca4ddec0e8ba6aab. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "03c6a4ca56f84348919a04e2eb1d4e3f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "608d91cac7f446e58d1c3d9005b7e42b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/5154 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e7060e386aae4b6cafc07d173d850281",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1021 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading strategy_qa...\n",
      "Downloading and preparing dataset strategy_qa_dataset/thoughtsource to /home/kon/.cache/huggingface/datasets/strategy_qa_dataset/thoughtsource/1.0.0/d5c1b6c9b30c0f277778d3867bd1adb57fdd0f396784386aaf10f05c988b5d38...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "488f011290fa4680bd1c7cdbfee9612e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "873e2304650c4bbeabb31004d881af10",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7ed408ac11884a749433e15223602d24",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error: 'answer_from_choices'\n",
      "Loading svamp...\n",
      "Downloading and preparing dataset svamp_dataset/thoughtsource to /home/kon/.cache/huggingface/datasets/svamp_dataset/thoughtsource/1.0.0/c688058c1768afd1c1773a3c61472df4c84e72f703e9fbd7901513bcb49eea9d...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "04f4b3ccf2e7497098a68dde0e74e727",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset svamp_dataset downloaded and prepared to /home/kon/.cache/huggingface/datasets/svamp_dataset/thoughtsource/1.0.0/c688058c1768afd1c1773a3c61472df4c84e72f703e9fbd7901513bcb49eea9d. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "592e4a40209043749b5eaaf73dd407f1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "83b2fe5aa9164004bf3dfc1be613ae23",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1000 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading worldtree...\n",
      "Downloading and preparing dataset worldtree_dataset/thoughtsource to /home/kon/.cache/huggingface/datasets/worldtree_dataset/thoughtsource/1.0.0/4ec0cd827b41f05891af9a27bf461fecd407e2fe7c1beebfed1eb00193c2cd52...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f01455e103b54fe1b7002093a9bf999d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ac594c207d664eccbdfbb06de159b3ec",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating test split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ef81c74389144e2986676b2d22708b2f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating validation split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset worldtree_dataset downloaded and prepared to /home/kon/.cache/huggingface/datasets/worldtree_dataset/thoughtsource/1.0.0/4ec0cd827b41f05891af9a27bf461fecd407e2fe7c1beebfed1eb00193c2cd52. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9e753785e51d411084f81329e3d53cc1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5641ab27dfe848f984c019c556665db5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/2207 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5b232bac26b14de1ab3b0041f741592f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1664 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8d83d993a99f4fd6af9b355975d6b78a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/496 [00:00<?, ?ex/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "names = [\n",
    "    # 'aqua', \n",
    "    # 'asdiv', \n",
    "    'commonsense_qa', \n",
    "    'entailment_bank', \n",
    "    'gsm8k', \n",
    "    'mawps', \n",
    "    'med_qa', \n",
    "    'medmc_qa', \n",
    "    'open_book_qa', \n",
    "    'pubmed_qa', \n",
    "    'qed', \n",
    "    'strategy_qa', \n",
    "    'svamp', \n",
    "    'worldtree'\n",
    "    ]\n",
    "\n",
    "for name in names:\n",
    "    try:\n",
    "        coll = Collection(name, load_pregenerated_cots=False, generate_mode=\"recache\")\n",
    "    # catch any error\n",
    "    except Exception as e:\n",
    "        print(f\"Error: {e}\")\n",
    "        continue\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
